Web Scraping Website Best Options

by: CaitlynEmma, 7 years ago


Hi there everyone,

First off I am fairly new to programming but have managed to get my program to extract data from websites such as (https://citydev-portal.edinburgh.gov.uk/idoxpa-web/search.do?action=advanced ) using Selenium and openpyxl .  With the program, I am inputting - keyword description, the application types I want to search through, and the date range; with the additional factor of looping through all of the application decisions.

My main issue is, with the larger number of applications (for example this site gives me over 500 for 6 months), the program is just slow and also crashes.

I was wondering if you have any recommendations for improving the program if this includes a mixture of selenium with something else I would love to take in any advice that you may have.

Below is the code that I have been using, and yes I am working on writing everything into functions, I am personally just glad that it works at this point.

#Url to grab information off of
mainPage = "https://citydev-portal.edinburgh.gov.uk/idoxpa-web/search.do?action=advanced"

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from openpyxl import Workbook
from openpyxl import load_workbook

description = ["window"]
ApplicationType = [0]
startDate = "01/05/2015"
endDate = "31/08/2015"
error = False
year = 2015

#Excel
wb = load_workbook('/Users/Caitlyn/Desktop/Program/Planning.xlsx')
sheet = wb.get_sheet_by_name('Data')
rowIndex = 2
columnIndex = 1

driver = webdriver.Chrome('/Users/Caitlyn/Downloads/chromedriver')
driver.get(mainPage)

# placing each description here
sbox = driver.find_element_by_id("description")
sheet.cell(row=rowIndex, column=columnIndex, value=description[0])
columnIndex = columnIndex + 1
sbox.send_keys(description[0])

# here is where you will cycle through dates
sbox = driver.find_element_by_id("applicationDecisionStart")
sbox.send_keys(startDate)
sbox = driver.find_element_by_id("applicationDecisionEnd")
sbox.send_keys(endDate)
sheet.cell(row=rowIndex, column=columnIndex, value=year)
columnIndex = columnIndex + 1

# here is were you will cycle through Application types
applicationBox = Select(driver.find_element_by_id("caseType"))
# here is were you will cycle through Decisions
decisionBox = Select(driver.find_element_by_id("caseDecision"))
for appIndex in ApplicationType:
  applicationBox = Select(driver.find_element_by_id("caseType"))
  print("AppIndex: %d" % appIndex)
  applicationBox.select_by_index(appIndex)
  applicationType = applicationBox.options[appIndex].text
  sheet.cell(row=rowIndex, column=columnIndex, value=applicationType)
  columnIndex = columnIndex + 1

  isallzero = False
  decisionBox = Select(driver.find_element_by_id("caseDecision"))
  for index in range(len(decisionBox.options)):
    if isallzero:
        print("breaking")
        break
    pageIndex = 1
    pageResults = 0

    decisionBox = Select(driver.find_element_by_id("caseDecision"))
    decisionBox.select_by_index(index)
    decision = decisionBox.options[index].text
    sheet.cell(row=rowIndex, column=columnIndex, value=decision)
    #space for results
    columnIndex = columnIndex + 2
    submit = driver.find_element_by_css_selector(".button.primary")
    submit.click()

    try:
      driver.find_element_by_class_name("messagebox")
      if index == 0:
          print("True")
          isallzero = True
    except NoSuchElementException:
      error = True

    if error:
      resultsRow = rowIndex
      while True:
        column = driver.find_elements_by_class_name("searchresult")
        pageResults = pageResults + len(column)
        for element in column:
          list = element.text.split("n")
          sheet.cell(row=rowIndex, column=columnIndex, value=list[0])
          columnIndex = columnIndex + 1
          sheet.cell(row=rowIndex, column=columnIndex, value=list[1])
          columnIndex = columnIndex + 1
          sheet.cell(row=rowIndex, column=columnIndex, value=list[2].split("|")[0])
          columnIndex = columnIndex - 2
          rowIndex = rowIndex + 1
        try:
          submit = driver.find_element_by_class_name("next")
          submit.click()
          pageIndex = pageIndex + 1
        except NoSuchElementException:
          break

      columnIndex = columnIndex - 1
      sheet.cell(row=resultsRow, column=columnIndex, value=pageResults)
      columnIndex = columnIndex - 1
    else:
      columnIndex = columnIndex - 1
      sheet.cell(row=rowIndex, column=columnIndex, value=0)
      columnIndex = columnIndex - 1
      rowIndex = rowIndex + 1

    driver.execute_script("window.history.go(-%d)" % pageIndex)
    error = False

  columnIndex = columnIndex - 1

wb.save('/Users/Caitlyn/Desktop/Program/Planning.xlsx')




You must be logged in to post. Please login or register an account.